In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
In [2]:
products = pd.read_csv('../data/amazon_baby.csv')
In [3]:
products.head()
Out[3]:
In [4]:
len(products)
Out[4]:
In [5]:
# scikit-learn's vectorizer cannot handle missing text, so drop all rows with empty fields
products = products.dropna()
len(products)
Out[5]:
Here scikit-learn works differently from GraphLab. Word counts are stored in a sparse matrix, where every column is a unique word and every row is a review. For demonstration purposes, and to stay in line with the lecture, a word_counts column is added below, but it is not actually used in the model later on. Instead, the CountVectorizer cv is used to transform the review text directly.
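Before applying this to the full dataset, a minimal self-contained sketch (using a hypothetical two-review toy corpus, not the Amazon data) shows what such a document-term matrix looks like:
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical two-review toy corpus, purely for illustration
toy_reviews = ["great toy, my baby loves it", "baby hated it, not great at all"]

toy_cv = CountVectorizer()
toy_counts = toy_cv.fit_transform(toy_reviews)  # sparse matrix: one row per review, one column per unique word

print(toy_counts.shape)                 # (2, number of unique words)
print(toy_cv.get_feature_names_out())   # the vocabulary, one entry per column (get_feature_names() on older scikit-learn)
print(toy_counts.toarray())             # dense view of the word counts per review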
In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
In [7]:
cv.fit(products['review']) # Learn the vocabulary of the full review corpus
# Demonstration only: a 2-D sparse matrix cannot be assigned to a DataFrame column directly,
# so store each review's sparse count vector row by row via list()
products['word_counts'] = list(cv.transform(products['review']))
In [8]:
products.head()
Out[8]:
In [9]:
products['name'].describe()
Out[9]:
The total number of reviews is lower than in the lecture video, most likely because the rows with NA values were dropped.
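A quick way to check this (a sketch that simply re-reads the same CSV and counts the missing values per column):
In [ ]:
# Count missing values per column in the raw file to see how many rows dropna() removed
raw = pd.read_csv('../data/amazon_baby.csv')
print(raw.isnull().sum())
print("Rows dropped:", len(raw) - len(raw.dropna()))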
In [10]:
# Use .copy() so that columns can be added to this subset later without pandas warnings
giraffe_reviews = products[products['name'] == 'Vulli Sophie the Giraffe Teether'].copy()
In [11]:
len(giraffe_reviews)
Out[11]:
In [12]:
giraffe_reviews['rating'].hist()
Out[12]:
In [13]:
giraffe_reviews['rating'].value_counts()
Out[13]:
In [14]:
# Ignore all 3-star reviews
products = products[products['rating'] != 3]
In [15]:
# Positive sentiment = rating of 4 or 5; everything else is negative (3-star reviews were removed above)
products['sentiment'] = products['rating'] >= 4
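Since accuracy is reported later, the class balance is worth a look; product reviews typically skew heavily positive. A quick sketch:
In [ ]:
# Share of positive vs. negative reviews; a strong skew means a trivial "always positive"
# classifier already scores well on accuracy
products['sentiment'].value_counts(normalize=True)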
In [16]:
products.head()
Out[16]:
In [17]:
# sklearn.cross_validation was removed in newer scikit-learn versions; model_selection is the current module
from sklearn.model_selection import train_test_split
# Due to the random split between the train and test data, the model will be
# slightly different from the lectures from here on out.
train_data, test_data = train_test_split(products, test_size=0.2, random_state=42)
In [18]:
from sklearn.linear_model import LogisticRegression
cv.fit(train_data['review']) # Refit the vectorizer on the training reviews only, so the vocabulary comes from the training data
sentiment_model = LogisticRegression().fit(cv.transform(train_data['review']), train_data['sentiment'])
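The same fit/transform pattern can also be written as a single scikit-learn Pipeline, which keeps the vectorizer and classifier bundled and avoids calling cv.transform by hand; a minimal sketch of that alternative (not used in the rest of this notebook):
In [ ]:
from sklearn.pipeline import make_pipeline

# Equivalent formulation: the pipeline fits the vectorizer and the classifier together,
# and applies the same vocabulary automatically when predicting on raw text
pipeline_model = make_pipeline(CountVectorizer(), LogisticRegression())
pipeline_model.fit(train_data['review'], train_data['sentiment'])
# pipeline_model.predict(test_data['review']) would then work on the raw review text directly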
In [19]:
# Predict sentiment for the test data, based on the sentiment model
# cv.transform converts the test reviews into the same word-count representation the model was trained on
predicted = sentiment_model.predict(cv.transform(test_data['review']))
In [20]:
from sklearn import metrics
# These metrics will be slightly different from those in the lecture, due to the different
# train/test data split and differences in how the model is fitted
print ("Accuracy:", metrics.accuracy_score(test_data['sentiment'], predicted))
print ("ROC AUC Score:", metrics.roc_auc_score(test_data['sentiment'], predicted))
print ("Confusion matrix:")
print (metrics.confusion_matrix(test_data['sentiment'], predicted))
print (metrics.classification_report(test_data['sentiment'], predicted))
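For reference, scikit-learn's confusion matrix puts true labels on the rows and predicted labels on the columns, so with the False/True class ordering used here the layout is [[TN, FP], [FN, TP]]. A small sketch recomputing accuracy from it:
In [ ]:
# Rows = true class, columns = predicted class (classes sorted, so False before True)
cm = metrics.confusion_matrix(test_data['sentiment'], predicted)
tn, fp, fn, tp = cm.ravel()
print("Accuracy recomputed from the confusion matrix:", (tn + tp) / float(cm.sum()))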
In [21]:
# For the ROC curve we need the predicted probabilities rather than the True/False labels,
# which are obtained by calling .predict_proba instead of .predict
predicted_probs = sentiment_model.predict_proba(cv.transform(test_data['review']))
In [22]:
false_positive_rate, true_positive_rate, _ = metrics.roc_curve(test_data['sentiment'], predicted_probs[:,1])
In [23]:
plt.plot(false_positive_rate, true_positive_rate)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Sentiment Analysis')
plt.show()
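Each point on this curve corresponds to a different probability cut-off; .predict effectively uses 0.5. Any other threshold can be applied to the probabilities directly, as in this sketch (the 0.7 value is purely illustrative):
In [ ]:
# Classify as positive only when the predicted probability is at least 0.7 (illustrative threshold)
stricter = predicted_probs[:, 1] >= 0.7
print("Accuracy at a 0.7 threshold:", metrics.accuracy_score(test_data['sentiment'], stricter))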
In [24]:
giraffe_reviews['predicted_sentiment'] = sentiment_model.predict_proba(cv.transform(giraffe_reviews['review']))[:,1]
In [25]:
giraffe_reviews.head()
Out[25]:
In [26]:
giraffe_reviews.sort_values(by='predicted_sentiment', inplace=True, ascending=False)
In [27]:
# Despite the slightly different model, the same review is ranked highest in predicted sentiment
giraffe_reviews.head(10)
Out[27]:
In [28]:
giraffe_reviews.iloc[0]['review']
Out[28]:
In [29]:
# The lowest scoring review in the lecture is ranked 10th lowest in this analysis
giraffe_reviews.tail(10)
Out[29]:
In [30]:
giraffe_reviews.iloc[-1]['review']
Out[30]: